#Import data set either using read.csv or RStudio's GUI

#Check the head of the data to see what we are looking at
head(train)
NA
NA
NA
NA
NA
NA
#Let's check to see if there are any N/A values in the data set
any(is.na(train))
[1] TRUE
library(Amelia)

missmap(train)

#According to the missmap from the Amelia library, Area Income appears to be the only one missing some data. Let's start visualizing the data first to see if Area Income has a major impact on whether or not someone clicked on the ads.

library(ggplot2)
pl <- ggplot(train, aes(Area.Income, Daily.Time.Spent.on.Site)) + geom_point(aes(color = factor(Clicked)))
pl

NA
NA
#Notice that income does not appear to play a major role in whether or not people clicked on the ads. People of all incomes, according to this plot, click on the ads. The more important factor here appears to be the daily time spent on the site. Let's examine another variable similar to this one that resides within the data frame. 

pl2 <- ggplot(train, aes(Daily.Internet.Usage, Daily.Time.Spent.on.Site)) + geom_point(aes(color = factor(Clicked)))
pl2

NA
NA
NA
NA
NA
#There is still a significant amount of clustering here, but in my opinion, it appears to be a tighter cluster in terms of the Area Income. Let's examine Daily Internet Usage mapped by Area Income.
pl3 <- ggplot(train, aes(Area.Income, Daily.Internet.Usage)) + geom_point(aes(color = factor(Clicked)))
pl3

#This plot looks pretty similar to the first one we examined. Given these two variables alone, we have a pretty good idea of whether or not a user clicked on the ads. Let's do more EDA then begin working on training our model.

pl4 <- ggplot(train, aes(Age)) + geom_histogram(fill = "blue", color = "black", alpha = 0.5, bins = 40)
pl4

pl5 <- ggplot(train, aes(Daily.Internet.Usage)) + geom_histogram(fill = "blue", color = "black", alpha = 0.5, bins = 40)
pl5

pl6 <- ggplot(train, aes(gender)) + geom_bar(aes(fill = factor(gender), alpha = 0.5))
pl6

pl7 <- ggplot(train, aes(Clicked, Area.Income)) + geom_boxplot(aes(group=Clicked, fill=factor(Clicked), alpha = 0.4))
ggplotly(pl7)
Removed 225 rows containing non-finite values (stat_boxplot).`group_by_()` is deprecated as of dplyr 0.7.0.
Please use `group_by()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
#We see from the plots above some very interesting information. The people that clicked on the ads spend less time on the internet in general and on the site each day. They also tend to have lower incomes, per the boxplot. Let's write a function to impute Area.Income into the missing spots.

#Using plotly, we can easily get the median Area Income to impute for each group: 62,430.55 where Clicked=0; 50,306.31 where Clicked=1.


 #' Fill missing Area Income values with the median of the matching click group.
 #'
 #' @param income Numeric vector of incomes; `NA` marks a missing value.
 #' @param clicked Vector of click indicators (0 = did not click), aligned
 #'   element-by-element with `income`.
 #' @param median_not_clicked Value imputed where `clicked == 0`.
 #' @param median_clicked Value imputed for every other click value.
 #' @return `income` with each `NA` replaced; observed values are untouched.
 impute_income <- function(income, clicked,
                           median_not_clicked = 62430.55,
                           median_clicked = 50306.31) {
   missing_income <- is.na(income)
   # Nothing to fill; this also lets zero-length input pass straight through.
   if (!any(missing_income)) {
     return(income)
   }
   # Without a click flag there is no group to take the median from.
   if (anyNA(clicked[missing_income])) {
     stop("`clicked` is NA for a row whose income is missing", call. = FALSE)
   }
   out <- income
   out[missing_income] <- ifelse(
     clicked[missing_income] == 0, median_not_clicked, median_clicked
   )
   out
 }

original.incomes <- train$Area.Income

fixed.incomes <- impute_income(train$Area.Income, train$Clicked)

train$Area.Income <- fixed.incomes
missmap(train, col = c("yellow", "black"))

#Great! Having no missing data is a good feeling. Now, let's start running some models. Begin with K-means and the relevant columns.
library(cluster)

df.relevant <- data.frame(train$Daily.Time.Spent.on.Site, train$Daily.Internet.Usage)

cluster.click <- kmeans(df.relevant, 2, nstart = 10)
library(cluster)

#Let's look at the clusplot!

clusplot(df.relevant, cluster.click$cluster, color = TRUE, shade = TRUE, labels = 0, lines = 0)

library(factoextra)

sil <- silhouette(cluster.click$cluster, dist(df.relevant))
fviz_silhouette(sil)

#Let's start the KNN model:
var(train2[, 1])
[1] 83416.67
var(train2[, 3])
[1] 249.0543
clicked <- train2[, 10]

knn.df <- data.frame(train2$Daily.Time.Spent.on.Site, train2$Daily.Internet.Usage)

knn.df <- cbind(knn.df, clicked)

knn.standardized <- scale(knn.df[, -3])


var(knn.standardized[, 1])
[1] 1
var(knn.standardized[, 2])
[1] 1
#Test - first 300 rows for test set
test.index <- 1:300
test.data <- knn.standardized[test.index, ]
#Labels are indexed with the same rows as test.data so the two stay aligned.
test.clicked <- clicked[test.index]

#Train - every row that is not in the test set
train.data <- knn.standardized[-test.index, ]
train.clicked <- clicked[-test.index]
#Run the model
library(class)

predictions.clicked <- knn(train.data, test.data, train.clicked, k=3)

mean(test.clicked != predictions.clicked)
[1] 0.1633333
#Let's observe the model with other k-values
predictions.clicked <- NULL
error.rate <- NULL

for (i in 1:25) {
  predictions.clicked <- knn(train.data, test.data, train.clicked, k=i)
  error.rate[i] <- mean(test.clicked != predictions.clicked)
}
k.values <- 1:25

error.df <- data.frame(error.rate, k.values)

pl8 <- ggplot(error.df, aes(k.values, error.rate)) + geom_point() + geom_line(lty="dotted", color = "red")
ggplotly(pl8)

NA
#So, the model runs most accurately with k=3; so that is what we will choose.
final.predictions.clicked <- knn(train.data, test.data, train.clicked, k=3)

mean(test.clicked != final.predictions.clicked)
[1] 0.1633333
#The model is approximately 83.67% accurate in its predictions.
---
title: "R Notebook - K-means Practice - Bryan Honeck "
output: html_notebook
---


```{r}
#####
#The given dataset has 1,000 records of users with 10 attributes. The classification variable is whether or not they clicked on an advertisement, denoted 0 or 1 in the last column of the data frame. We are trying to predict which users are more likely to click on the advertisements.
#####


#Import data set either using read.csv or RStudio's GUI
#NOTE(review): the read.csv() call below is commented out and points at an absolute local Windows path, so this chunk only works when `train` already exists in the workspace.
#train <- read.csv("C:/Users/Bryan/Downloads/train.csv")

#Check the head of the data to see what the data looks like.
head(train)

```

```{r}
#Attach Amelia first; missmap() below comes from it.
library(Amelia)

#Does the data set contain at least one N/A value?
any(is.na(train))

#Map the missing cells to see which columns they fall in.
missmap(train)

```
```{r}
#According to the missmap from the Amelia library, Area Income appears to be the only column missing some data. Let's start by visualizing the data to see whether Area Income has a major impact on whether or not someone clicked on the ads. If it does, we will write a function that imputes Area Income as accurately as possible.

library(ggplot2)
pl <- ggplot(train, aes(x = Area.Income, y = Daily.Time.Spent.on.Site)) +
  geom_point(aes(color = factor(Clicked)))
pl


```
```{r}
#Notice that income does not appear to play a major role in whether or not people clicked on the ads. People of all incomes, according to this plot, click on the ads. The more important factor here appears to be the daily time spent on the site. Let's examine another variable similar to this one that resides within the data frame. 

pl2 <- ggplot(train, aes(x = Daily.Internet.Usage, y = Daily.Time.Spent.on.Site)) +
  geom_point(aes(color = factor(Clicked)))
pl2
```

```{r}
#There is still a significant amount of clustering here, but in my opinion, it appears to be a tighter cluster in terms of the Area Income. Let's examine Daily Internet Usage mapped by Area Income.
pl3 <- ggplot(train, aes(x = Area.Income, y = Daily.Internet.Usage)) +
  geom_point(aes(color = factor(Clicked)))
pl3

```

```{r}
#This plot looks pretty similar to the first one we examined. Given these two variables alone, we have a pretty good idea of whether or not a user clicked on the ads. Let's do more EDA then begin working on training our model.
library(plotly)

#Distribution of Age.
pl4 <- ggplot(train, aes(x = Age)) +
  geom_histogram(fill = "blue", color = "black", alpha = 0.5, bins = 40)
pl4

#Distribution of Daily.Internet.Usage.
pl5 <- ggplot(train, aes(x = Daily.Internet.Usage)) +
  geom_histogram(fill = "blue", color = "black", alpha = 0.5, bins = 40)
pl5

#Row counts per gender value. alpha is a fixed setting, so it sits outside aes(); inside aes() it would be mapped like a variable and add a meaningless "alpha" legend.
pl6 <- ggplot(train, aes(x = gender)) +
  geom_bar(aes(fill = factor(gender)), alpha = 0.5)
pl6

#Area.Income split by click outcome, rendered through plotly so the box statistics can be read interactively.
pl7 <- ggplot(train, aes(x = Clicked, y = Area.Income)) +
  geom_boxplot(aes(group = Clicked, fill = factor(Clicked)), alpha = 0.4)
ggplotly(pl7)
```
```{r}
#We see from the plots above some very interesting information. The people that clicked on the ads spend less time on the internet in general and on the site each day. They also tend to have lower incomes, per the boxplot. Let's write a function to impute Area.Income into the missing spots.

#Using plotly, we can easily get the median Area Income to impute for each group: 62,430.55 where Clicked=0; 50,306.31 where Clicked=1.


 #' Fill missing Area Income values with the median of the matching click group.
 #'
 #' @param income Numeric vector of incomes; `NA` marks a missing value.
 #' @param clicked Vector of click indicators (0 = did not click), aligned
 #'   element-by-element with `income`.
 #' @param median_not_clicked Value imputed where `clicked == 0`.
 #' @param median_clicked Value imputed for every other click value.
 #' @return `income` with each `NA` replaced; observed values are untouched.
 impute_income <- function(income, clicked,
                           median_not_clicked = 62430.55,
                           median_clicked = 50306.31) {
   missing_income <- is.na(income)
   # Nothing to fill; this also lets zero-length input pass straight through.
   if (!any(missing_income)) {
     return(income)
   }
   # Without a click flag there is no group to take the median from.
   if (anyNA(clicked[missing_income])) {
     stop("`clicked` is NA for a row whose income is missing", call. = FALSE)
   }
   out <- income
   out[missing_income] <- ifelse(
     clicked[missing_income] == 0, median_not_clicked, median_clicked
   )
   out
 }

#Keep an untouched copy of the raw column before it is overwritten.
original.incomes <- train$Area.Income

#Impute, then write the completed column back into the data frame.
fixed.incomes <- impute_income(
  income = train$Area.Income,
  clicked = train$Clicked
)
train$Area.Income <- fixed.incomes

#Let's check to see that it worked properly!

```

```{r}
#Redraw the missingness map, with a custom two-colour palette, to confirm that the imputation left no gaps.
missmap(train, col = c("yellow", "black"))
```
```{r}
#Great! Having no missing data is a good feeling. Now, let's start running some models. Begin with K-means and the relevant columns.
library(cluster)

#Cluster on the two usage columns only.
df.relevant <- data.frame(
  train$Daily.Time.Spent.on.Site,
  train$Daily.Internet.Usage
)

#Two centres, 10 random starts.
cluster.click <- kmeans(df.relevant, centers = 2, nstart = 10)


```

```{r}
library(cluster)

#Let's look at the clusplot!
clusplot(
  df.relevant,
  cluster.click$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 0,
  lines = 0
)
```
```{r}
library(factoextra)

#Silhouette of the k-means assignment over the pairwise distances of the clustered columns.
sil <- silhouette(
  cluster.click$cluster,
  dist(df.relevant)
)
fviz_silhouette(sil)
#Above 0.5; looks pretty good!
```


```{r}
#Let's start the KNN model:
#NOTE(review): `train2` is not created anywhere in this notebook, so it must already exist in the workspace. Presumably it is a copy of `train` - confirm, and confirm that column 10 really is Clicked.
#Raw variances of columns 1 and 3 (presumably shown to motivate the scaling below - confirm).
var(train2[, 1])
var(train2[, 3])

#Class labels, picked by position (column 10).
clicked <- train2[, 10]

#Feature set: the two usage columns.
knn.df <- data.frame(train2$Daily.Time.Spent.on.Site, train2$Daily.Internet.Usage)

#CBIND the classification column
knn.df <- cbind(knn.df, clicked)

#Standardize the two feature columns; column 3 (the label) is dropped first.
knn.standardized <- scale(knn.df[, -3])


#Check the variance of each scaled column.
var(knn.standardized[, 1])
var(knn.standardized[, 2])
```

```{r}
#Test - first 300 rows for test set
test.index <- 1:300
test.data <- knn.standardized[test.index, ]
#Labels are indexed with the same rows as test.data so the two stay aligned.
test.clicked <- clicked[test.index]

#Train - every row that is not in the test set
train.data <- knn.standardized[-test.index, ]
train.clicked <- clicked[-test.index]

```


```{r}
#Run the model
library(class)

#Classify the held-out rows from their 3 nearest training neighbours.
predictions.clicked <- knn(
  train = train.data,
  test = test.data,
  cl = train.clicked,
  k = 3
)

#Share of test rows whose predicted label differs from the true one.
mean(test.clicked != predictions.clicked)

```

```{r}
#Let's observe the model with other k-values
#One slot per k from 1 to 25, allocated up front instead of growing inside the loop.
error.rate <- numeric(25)

for (i in seq_along(error.rate)) {
  #Refit with k = i and record the test misclassification rate.
  predictions.clicked <- knn(train.data, test.data, train.clicked, k = i)
  error.rate[i] <- mean(test.clicked != predictions.clicked)
}

```

```{r}
#Pair every k that was tried with its test error.
k.values <- 1:25
error.df <- data.frame(error.rate, k.values)

#Error rate against k, as an interactive plotly widget.
pl8 <- ggplot(error.df, aes(x = k.values, y = error.rate)) +
  geom_point() +
  geom_line(lty = "dotted", color = "red")
ggplotly(pl8)

```

```{r}
#So, the model runs most accurately with k=3; so that is what we will choose.
final.predictions.clicked <- knn(
  train = train.data,
  test = test.data,
  cl = train.clicked,
  k = 3
)

mean(test.clicked != final.predictions.clicked)

#The model is approximately 83.67% accurate in its predictions.
```
















